# Alex F. Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(here)
library(tidylog)
library(sf)
library(tigris)
library(tidycensus)

g <- readRDS(here("data", "inter", "bgm_processed.rds"))

# Prepare additional variables needed for credits analysis
g <- g %>%
    mutate(
        # Clean and standardize all numeric variables
        across(c(capital_investment_million, targeted_annual_production, realized_annual_production, target_jobs, current_jobs), ~ case_when(
            .x %in% c("?", "NA", "None") ~ NA_character_,
            str_detect(.x, "\\+$") ~ str_remove(.x, "\\+"),  # Remove + signs (e.g., "200+" -> "200")
            TRUE ~ .x
        )),
        # Convert to numeric
        across(c(capital_investment_million, targeted_annual_production, realized_annual_production, target_jobs, current_jobs), ~ as.numeric(.x))
    ) %>%
    # Rename for consistency
    rename(invest_m = capital_investment_million) %>%
    # Create all binary indicators in one operation
    mutate(
        # Basic binary indicators (handle NAs by treating as 0)
        capital_bin = as.integer(coalesce(invest_m > 0, FALSE)),
        current_jobs_bin = as.integer(coalesce(current_jobs > 0, FALSE)),
        target_jobs_bin = as.integer(coalesce(target_jobs > 0, FALSE)),
        jobs_target = as.integer(!is.na(target_jobs)),
        # Investment thresholds
        invest_100m = as.integer(coalesce(invest_m > 100, FALSE)),
        invest_250m = as.integer(coalesce(invest_m > 250, FALSE)),
        invest_500m = as.integer(coalesce(invest_m > 500, FALSE)),
        # Job thresholds
        jobs_100 = as.integer(coalesce(target_jobs > 100, FALSE)),
        jobs_500 = as.integer(coalesce(target_jobs > 500, FALSE)),
        jobs_1000 = as.integer(coalesce(target_jobs > 1000, FALSE)),
        jobs_5000 = as.integer(coalesce(target_jobs > 5000, FALSE))
    )

# Identify project county FIPS17 based on latitude and longitude
# Only process projects with valid coordinates, preserve others with NA FIPS
county_full <- tigris::counties(cb = TRUE, year = 2017)
county_full$fips <- as.numeric(paste0(county_full$STATEFP, county_full$COUNTYFP))

# Create spatial version for spatial joins (minimal columns for performance)
county_sf <- county_full %>% 
  dplyr::select(fips, geometry)

# Separate projects with valid vs missing coordinates
valid_coords <- g %>% 
  filter(!is.na(longitude) & !is.na(latitude))
missing_coords <- g %>% 
  filter(is.na(longitude) | is.na(latitude))

message(sprintf("Processing FIPS assignment: %d projects with valid coordinates, %d with missing coordinates", 
                nrow(valid_coords), nrow(missing_coords)))

# Only create spatial points for projects with valid coordinates
if (nrow(valid_coords) > 0) {
  turner_points <- st_as_sf(valid_coords, coords = c("longitude", "latitude"), crs = st_crs(county_sf))
  joined <- st_join(turner_points, county_sf[, c("fips")], join = st_within, left = TRUE)
  valid_coords$fips <- joined$fips
  
  # Report initial FIPS assignment results
  successful_fips <- sum(!is.na(joined$fips))
  failed_fips <- sum(is.na(joined$fips))
  
  message(sprintf("Initial FIPS assignment: %d successful, %d failed", successful_fips, failed_fips))
  
  # For failed assignments (likely offshore projects), assign to nearest county
  if(failed_fips > 0) {
    message("Assigning offshore projects to nearest counties...")
    
    # Find points that failed initial assignment
    failed_indices <- which(is.na(valid_coords$fips))
    failed_points <- turner_points[failed_indices, ]
    
    # For each failed point, find the nearest county
    for(i in seq_along(failed_indices)) {
      idx <- failed_indices[i]
      point <- failed_points[i, ]
      
      # Calculate distances to all counties
      distances <- st_distance(point, county_sf)
      nearest_county_idx <- which.min(distances)
      nearest_fips <- county_sf$fips[nearest_county_idx]
      nearest_distance_km <- round(as.numeric(distances[nearest_county_idx]) / 1000, 2)
      
      # Assign the nearest county FIPS
      valid_coords$fips[idx] <- nearest_fips
      
      message(sprintf("  Site %s: assigned to nearest county (FIPS %s, %.2f km away)", 
                      valid_coords$site_id[idx], nearest_fips, nearest_distance_km))
    }
    
    # Report final results
    final_successful <- sum(!is.na(valid_coords$fips))
    message(sprintf("Final FIPS assignment: %d successful (including %d offshore assignments)", 
                    final_successful, failed_fips))
  }
} else {
  valid_coords$fips <- numeric(0)
}

# Projects with missing coordinates will have NA FIPS
if (nrow(missing_coords) > 0) {
  missing_coords$fips <- NA_real_
  message(sprintf("%d projects have missing coordinates and will have NA FIPS (manual corrections applied below)", 
                  nrow(missing_coords)))
}

# Recombine all projects and clean site_id
g <- bind_rows(valid_coords, missing_coords) %>%
  mutate(site_id = str_remove(site_id, "\\.0"))

# Apply manual FIPS corrections for specific known cases
g <- g %>%
  mutate(
    fips = case_when(
      site_id == "366" ~ 08001,  # Adams County, CO
      site_id == "597" ~ 6087,  # Santa Clara County, CA
      site_id == "704" ~ 42003,
      TRUE ~ fips
    ),
    # Create analysis indicators for operating status
    construction = ifelse(operating_status %in% c("Under Construction", "Planned", "Pilot"), 1, 0),
    operating = ifelse(operating_status %in% c("Operating", "Operating Partially; Under Construction"), 1, 0)
  ) %>%
  # Remove unnecessary columns
  dplyr::select(-c(announced_post_ira, project_announcement_date_original))

# Save processed data for credits analysis
write_csv(g, here("data", "inter", "turner_processed.csv"))
message("Successfully processed and saved Turner data for credits analysis.") 
